Setup:
# Global knitr options: show code in the rendered document and cap printed
# output at 100 values per object.
knitr::opts_chunk$set(echo = TRUE, max.print = 100)
Load libraries:
library(readxl)
library(data.table)
library(tidytable)
## As of tidytable v0.9.0 dotless versions of functions are exported.
## You can now use `arrange()`/`mutate()`/etc. directly.
##
## Attaching package: 'tidytable'
## The following objects are masked from 'package:data.table':
##
## between, first, fread, last
## The following objects are masked from 'package:stats':
##
## dt, filter, lag
## The following object is masked from 'package:base':
##
## %in%
library(stringr)
library(stringi)
library(Cairo)
library(geosphere)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:tidytable':
##
## arrange, distinct, filter, group_by, mutate, rename, select, slice,
## summarise, transmute, ungroup
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(geodata)
## Loading required package: terra
## terra 1.5.21
##
## Attaching package: 'terra'
## The following object is masked from 'package:ggplot2':
##
## arrow
## The following object is masked from 'package:tidytable':
##
## extract
## The following object is masked from 'package:data.table':
##
## shift
library(plyr)
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:tidytable':
##
## arrange, count, desc, mutate, rename, summarise, summarize
library(tibble)
##
## Attaching package: 'tibble'
## The following object is masked from 'package:tidytable':
##
## enframe
library(forcats)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following objects are masked from 'package:tidytable':
##
## map, map_chr, map_dbl, map_df, map_dfc, map_dfr, map_int, map_lgl,
## map2, map2_chr, map2_dbl, map2_df, map2_dfc, map2_dfr, map2_int,
## map2_lgl, pmap, pmap_chr, pmap_dbl, pmap_df, pmap_dfc, pmap_dfr,
## pmap_int, pmap_lgl, walk
## The following object is masked from 'package:data.table':
##
## transpose
library(Rfast)
## Loading required package: Rcpp
## Loading required package: RcppZiggurat
##
## Attaching package: 'Rfast'
## The following objects are masked from 'package:purrr':
##
## is_integer, transpose
## The following object is masked from 'package:tidytable':
##
## nth
## The following object is masked from 'package:data.table':
##
## transpose
Load Phonotacticon:
# Read the Phonotacticon spreadsheet and convert it to a data.table.
Phonotacticon <- as.data.table(read_xlsx("Phonotacticon.xlsx"))
Phonotacticon
Load PhonoBib:
# Read the PhonoBib bibliography/metadata spreadsheet as a data.table.
PhonoBib <- as.data.table(read_xlsx("PhonoBib.xlsx"))
PhonoBib
Load PanPhon and sort out duplicate rows:
# Read the PanPhon feature table and keep one row per distinct 'ipa' value.
# fread() already returns a data.table, so the redundant as.data.table()
# copy has been dropped.
PanPhon <- fread("PanPhonPhonotacticon1_0.csv") %>%
unique(by = 'ipa')
PanPhon
Arrange PanPhon segments in alphabetical order:
# Sort segments by decreasing string length, then alphabetically, so that
# when they are pasted into a regex alternation below the longest
# (multi-character) segments are tried first and cannot be shadowed by
# their shorter prefixes.
PanPhonOrder <- PanPhon$ipa[
order(-nchar(PanPhon$ipa),
PanPhon$ipa)]
head(PanPhonOrder)
## [1] "h͡d̪͡ɮ̪ʲʷ" "h͡d̪͡ɮ̪ʷː" "h͡d̪͡ɮ̪ʷˠ" "h͡d̪͡ɮ̪ʷˤ" "h͡d̪͡z̪ʲʷ" "h͡d̪͡z̪ʷː"
Create PanPhon regex:
# Non-capturing alternation matching any PanPhon segment (longest first)
# or one of the capital cover-class symbols used in the phonotactic
# notation (B, C, Č, F, ...).
PanPhonRegex <- paste0("(?:",
paste(PanPhonOrder, collapse="|"),
'|B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z',
")")
str_trunc(PanPhonRegex, 100)
## [1] "(?:h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪ʷˤ|h͡t̪͡ɬ̪ʲʷ|h͡t̪..."
Create PanPhon regex including brackets:
# Same alternation as PanPhonRegex, with an extra first branch that uses
# lookbehind/lookahead to match the whole content of a [...] group as a
# single token, so bracketed segment sets survive extraction intact.
PanPhonRegexBrackets <- paste0('(?:',
'(?<=\\[).*?(?=\\])|',
paste(PanPhonOrder, collapse="|"),
'|B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z',
')')
str_trunc(PanPhonRegexBrackets, 100)
## [1] "(?:(?<=\\[).*?(?=\\])|h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪..."
Define classes:
# Define the capital cover classes in terms of PanPhon binary features
# (values appear to be coded 1 / -1 — the comparisons below rely on that),
# then reshape to a long (ipa, Class) membership table.
Classes <- PanPhon %>%
mutate(B = cons == 1 & lab == 1,          # labial consonants
C = cons == 1,                            # any consonant
Č = cons == 1 & delrel == 1 & son == -1,  # affricates
`F` = cons == 1 & cont == 1 & son == -1,  # fricatives
G = grepl('j|w|ɥ|ɰ', ipa),                # glides, by symbol
Ł = cons == 1 & cor == 1 & lat == 1,      # laterals
L = cons == 1 & cont == 1 & cor == 1 & son == 1,
N = nas == 1 & syl == -1,                 # nasal consonants
O = cont == 1 & son == 1 & syl == -1 & !grepl('h|ɦ', ipa),
P = cons == 1 & cont == -1 & delrel == -1 & son == -1,  # plain stops
R = cons == 1 & cont == 1 & cor == 1 & lat == -1 & son == 1,  # rhotics
S = cons == 1 & cont == 1 & cor == 1 & son == -1,  # sibilant-type fricatives
`T` = cons == 1 & son != 1,               # obstruents
V = cons == -1 & cont == 1 & son == 1 & syl == 1,  # vowels
W = syl == -1 & voi == 1,                 # voiced non-syllabics
X = syl == -1 & voi == -1,                # voiceless non-syllabics
Z = cont == 1 & syl == -1) %>%            # non-syllabic continuants
select(ipa, B, C, Č, `F`, G, Ł, L, N, O, P, R, S, `T`, V, W, X, Z) %>%
# Long format: one row per (segment, class) pair; keep only memberships.
melt(id.vars = 'ipa',
variable.name = 'Class',
value.name = 'Value') %>%
filter(Value) %>%
select(-Value)
Classes
Subset Eurasian lects that are complete in Phonotacticon:
# Keep only Eurasian lects with usable entries: flagged Complete, a
# non-missing phoneme inventory (P), and no unknown ('?') onset/nucleus/coda
# templates. The last four filters drop entries whose notation would explode
# combinatorially when expanded below (runs of two or more 'C' cover symbols,
# or two bracketed sets of 10+ characters each) — presumably an expansion
# guard; TODO confirm intent.
Eurasia <- Phonotacticon %>%
filter(Macroarea == 'Eurasia',
Complete,
complete.cases(P),
O != '?',
N != '?',
C != '?',
!grepl('C{2,}', O),
!grepl('C{2,}', C),
!grepl('\\[.{10}.*?\\]\\[.{10}.*?\\]', O),
!grepl('\\[.{10}.*?\\]\\[.{10}.*?\\]', C)) %>%
select(-Note, -Complete, -Macroarea)
Eurasia[1,]
Extract phonemes:
# Tokenize each lect's phoneme inventory string (column P) into individual
# segments. simplify = TRUE yields a wide character matrix padded with ''
# for short inventories; melt it to one (Lect, ipa) row per phoneme and
# drop the padding.
Phonemes <- stri_extract_all_regex(Eurasia$P,
pattern = PanPhonRegex,
simplify = TRUE) %>%
as.data.table() %>%
mutate(Lect = Eurasia$Lect) %>%
melt(id.vars = 'Lect',
variable.name = 'Number',
value.name = 'ipa') %>%
select(-Number) %>%
filter(ipa != '')
Phonemes
Subset lect, onsets, nuclei, and codas:
# Long table of phonotactic templates: one row per lect per category,
# where Category is O (onset), N (nucleus), or C (coda) and Sequence is the
# space-separated list of licit sequences for that slot.
LectONC <- Eurasia %>%
select(Lect, O, N, C) %>%
melt(id.vars = 'Lect',
variable.name = 'Category',
value.name = 'Sequence')
LectONC
Extract the sequences:
# Split each space-separated template into its individual sequences.
# The split pattern is a literal single space, so fixed = TRUE is correct
# (the original fixed = FALSE invoked the regex engine for the same result).
# tstrsplit pads short rows with NA; those padding cells are dropped below.
Sequences <- LectONC[, tstrsplit(Sequence, ' ', fixed = TRUE)] %>%
mutate(Lect = LectONC$Lect,
Category = LectONC$Category) %>%
melt(id.vars = c('Lect', 'Category'),
variable.name = 'Number',
value.name = 'Sequence') %>%
select(-Number) %>%
filter(!is.na(Sequence)) %>%
distinct()
Sequences
Subset sequences including capital letters:
# Sequences that contain at least one capital cover-class symbol; these
# still need to be expanded into concrete phonemes (Category is irrelevant
# for the expansion, so it is dropped before deduplicating).
Capitals <-
Sequences %>%
filter(grepl('B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z', Sequence)) %>%
select(-Category) %>%
distinct()
Capitals
Convert the capitals into the corresponding phonemes:
# Expand every capital cover symbol into the concrete phonemes it stands
# for, restricted to phonemes the lect actually has, and enumerate all
# resulting concrete sequences.
Decapitalized <-
# 1. Tokenize each capital-bearing sequence into ordered segments.
stri_extract_all_regex(Capitals$Sequence,
pattern = PanPhonRegex,
simplify = TRUE) %>%
as.data.table() %>%
mutate(Lect = Capitals$Lect,
Sequence = Capitals$Sequence) %>%
melt(id.vars = c('Lect', 'Sequence'),
variable.name = 'Order',
value.name = 'Segment') %>%
# Column labels V1, V2, ... become integer positions 1, 2, ...
mutate(Order = Order %>%
as.factor() %>%
as.integer()) %>%
filter(Segment != '') %>%
# 2. A capital matches a Class and fans out to its member phonemes;
#    a literal segment finds no Class and keeps its own ipa.
left_join(Classes, by = c('Segment' = 'Class')) %>%
mutate(ipa = if_else(is.na(ipa), Segment, ipa)) %>%
select(-Segment) %>%
# 3. Keep only phonemes in this lect's inventory (natural join on
#    Lect + ipa).
inner_join(Phonemes) %>%
setorder(col = Order) %>%
# 4. Per (Lect, Sequence): collect candidate phonemes per position and
#    take the Cartesian product to enumerate all concrete sequences.
split(by = c('Lect', 'Sequence')) %>%
lapply(function(x)
split(x, by = 'Order')) %>%
lapply(function(x)
lapply(x, function(x)
x <- x$ipa)) %>%
lapply(function(x)
expand.grid(x) %>%
do.call(what = paste0)) %>%
enframe() %>%
unnest() %>%
as.data.table() %>%
# 5. The list names are "Lect.Sequence"; split them back apart.
#    NOTE(review): this breaks if a Lect name itself contains '.' — verify.
separate(col = name,
into = c('Lect', 'Sequence'),
sep = '\\.') %>%
setnames('value', 'NewSequence') %>%
# 6. Replace capital-bearing sequences with their expansions; sequences
#    without capitals pass through unchanged.
full_join(Sequences) %>%
mutate(Sequence =
if_else(!is.na(NewSequence),
NewSequence,
Sequence)) %>%
select(-NewSequence)
Decapitalized
Split the sequences into segments:
# Tokenize the decapitalized sequences into ordered segments. The bracket
# branch of PanPhonRegexBrackets keeps each [...] set as a single token so
# it can be expanded in the next step.
ToUnbracket <- stri_extract_all_regex(Decapitalized$Sequence,
pattern = PanPhonRegexBrackets,
simplify = TRUE) %>%
as.data.table() %>%
mutate(Lect = Decapitalized$Lect,
Category = Decapitalized$Category,
Sequence = Decapitalized$Sequence) %>%
melt(id = c('Lect', 'Category', 'Sequence'),
variable.name = 'Order',
value.name = 'ipa') %>%
# V1, V2, ... column labels -> integer segment positions.
mutate(Order = Order %>%
as.factor() %>%
as.integer()) %>%
filter(ipa != "")
ToUnbracket
Subset bracketed sequences:
# Rows whose sequence still contains a bracketed segment set to expand.
Bracketed <- filter(ToUnbracket, grepl('\\[', Sequence))
Bracketed
Convert the bracketed sequences into all possible sequences:
# Expand each bracketed segment set into its member phonemes and enumerate
# every concrete sequence, mirroring the capital expansion above but keyed
# by Sequence only.
Unbracketed <- Bracketed$ipa %>%
# Each 'ipa' token here is the content of one [...] set; split it into
# the individual phonemes it offers for that position.
stri_extract_all_regex(pattern = PanPhonRegex, simplify = TRUE) %>%
as.data.table() %>%
mutate(Sequence = Bracketed$Sequence,
Order = Bracketed$Order) %>%
melt(id.vars = c('Sequence', 'Order'),
variable.name = 'Number',
value.name = 'ipa') %>%
filter(ipa != '') %>%
select(-Number) %>%
setorder(col = Order) %>%
# Cartesian product across positions -> all concrete sequences.
split(by = 'Sequence') %>%
lapply(function(x)
split(x, by = 'Order')) %>%
lapply(function(x)
lapply(x, function(x)
x <- x$ipa)) %>%
lapply(function(x)
expand.grid(x) %>%
do.call(what = paste0)) %>%
enframe() %>%
unnest() %>%
setnames(c('name', 'value'),
c('Sequence', 'NewSequence'))
Unbracketed
Split the sequences into segments:
# Split every fully-expanded sequence into ordered segments, then merge
# back with ToUnbracket so unbracketed rows pass through unchanged while
# bracketed rows are replaced by their expansions.
Segments <-
stri_extract_all_regex(
Unbracketed$NewSequence,
pattern = PanPhonRegex,
simplify = TRUE) %>%
as.data.table() %>%
mutate(Sequence = Unbracketed$Sequence,
NewSequence = Unbracketed$NewSequence) %>%
pivot_longer(cols = -c(Sequence, NewSequence),
names_to = 'Order',
values_to = 'NewIPA') %>%
# V1, V2, ... labels -> integer segment positions.
mutate(Order = Order %>%
as.factor() %>%
as.integer()) %>%
filter(NewIPA != '') %>%
# Natural join on (Sequence, Order); rows from ToUnbracket without an
# expansion get NA NewSequence/NewIPA and keep their original values.
full_join(ToUnbracket) %>%
mutate(Sequence =
if_else(
!is.na(NewSequence),
NewSequence,
Sequence),
ipa =
if_else(
!is.na(NewIPA),
NewIPA,
ipa)) %>%
select(-NewSequence, -NewIPA)
Segments
Measure the length of each sequence:
# Length of each sequence = largest segment index (Order) within each
# (Lect, Category, Sequence) group.
Sequences_length <-
Segments[, .(Length = max(Order)), .(Lect, Category, Sequence)]
Sequences_length
Join the length of each sequence to segments:
# Attach each sequence's length to its segment rows (natural join on
# Lect, Category, Sequence).
Segments <- Segments %>%
left_join(Sequences_length)
Segments
Count the maximal length:
# Longest sequence observed anywhere in the data.
MaxLength <- max(Sequences_length[['Length']])
MaxLength
## [1] 6
Count the number of split segments:
# Total number of segment rows after splitting.
Segments_number <- Segments %>% nrow()
Segments_number
## [1] 44593
Assign different positions to each sequence:
# Replicate the whole segment table once per possible alignment offset
# (Position 0 .. MaxLength-1) and shift each copy's Order accordingly, so
# a short sequence can be compared against every window of a longer one.
# Alignments that would run past MaxLength are dropped.
Sequences_rep <- bind_rows(rep(list(Segments), MaxLength))
Sequences_rep <- Sequences_rep %>%
mutate(Position = rep(0:(MaxLength - 1),
each = Segments_number)) %>%
mutate(Order = Order + Position) %>%
filter(Length + Position <= MaxLength) %>%
select(-Length)
Sequences_rep
Join segments with their phonological features:
# Build one feature vector per (sequence, alignment position): join each
# segment to its PanPhon features, tag every feature with its position in
# the sequence (Feature1, Feature2, ...), and pivot wide. fill = 0 zero-pads
# positions a sequence does not reach; fun.aggregate = sum collapses any
# duplicate rows.
Sequences_features <- Sequences_rep %>%
left_join(PanPhon, by = 'ipa') %>%
melt(id = c('Lect',
'Category',
'Sequence',
'Order',
'ipa',
'Position'),
variable.name = 'Feature',
value.name = 'Value') %>%
mutate(Feature = paste0(Feature, Order)) %>%
dcast(Lect + Category + Sequence + Position ~ Feature,
value.var = 'Value',
fun.aggregate = sum,
fill = 0) %>%
# Key each row by sequence + alignment position, then drop the lect- and
# category-specific columns so identical vectors deduplicate.
mutate(SequencePosition = paste0(Sequence, Position)) %>%
select(-Lect, -Category, -Position, -Sequence) %>%
distinct()
Sequences_features
Calculate the distance between sequences:
# Pairwise distance matrix over the numeric feature vectors via
# Rfast::Dist (Euclidean by default), returned as a square matrix and
# converted to a data.table.
Sequences_distance <- Sequences_features %>%
select(-SequencePosition) %>%
Dist() %>%
as.data.table()
Sequences_distance[1:5, 1:5]
Define a vector of all sequences:
# Strip the alignment-position suffix that paste0(Sequence, Position)
# appended, recovering the bare sequence string. The suffix sits at the END
# of the string, so the pattern is anchored with '+$'; the original
# unanchored '[0-9]' removed only the first digit found anywhere, which
# would corrupt any sequence that itself contained a digit.
SequenceVectors <-
str_replace(Sequences_features$SequencePosition, '[0-9]+$', '')
head(SequenceVectors)
## [1] "bl" "bl" "bl" "bl" "bl" "d"
Name the rows and the columns of the distance matrix:
# Label the distance matrix's columns and rows with the sequence strings.
# Duplicate column names are intentional (one column per alignment copy);
# data.table permits them, and the melt below keeps them apart by position.
setnames(Sequences_distance, SequenceVectors)
Sequences_distance[, Sequence.x := SequenceVectors]
Sequences_distance[1:5, 1:5]
Calculate the minimal distance per each sequence pair:
# Collapse the alignment copies: for each ordered pair of sequences, keep
# the minimum distance over all alignment positions.
Sequences_MinDistance <-
Sequences_distance %>%
melt(id.vars = 'Sequence.x',
variable.name = 'Sequence.y',
value.name = 'Distance') %>%
as.data.table() %>%
.[, .(Distance = min(Distance)),
by = .(Sequence.x, Sequence.y)]
Sequences_MinDistance
Subset the distance of /pl/ and other segments:
# Distances from /pl/ to every other sequence (the empty-set symbol
# excluded), nearest first.
pl <- Sequences_MinDistance %>%
filter(Sequence.x == 'pl' & Sequence.y != '∅') %>%
arrange(Distance)
pl
Subset the distance of /ia/ and other segments:
# Distances from /ia/ to every other sequence (the empty-set symbol
# excluded), nearest first.
ia <- Sequences_MinDistance %>%
filter(Sequence.x == 'ia' & Sequence.y != '∅') %>%
arrange(Distance)
ia
Define new sequences with new (decapitalized and unbracketed) sequences:
# Unique decapitalized/unbracketed sequences per lect and category.
NewSequences <- as.data.table(select(Segments, Lect, Category, Sequence))
NewSequences
Compare the difference between two lects within the same category:
# Lect-to-lect distance per category, via a nearest-neighbour scheme:
# cross every sequence of lect x with every sequence of lect y in the same
# category, take each x-sequence's distance to its closest y-sequence,
# average those minima over x's sequences, then symmetrize each unordered
# lect pair with the larger of the two directed averages.
ONC_distance <- NewSequences %>%
left_join(NewSequences, by = 'Category') %>%
left_join(Sequences_MinDistance,
by = c('Sequence.x', 'Sequence.y')) %>%
.[, .(Distance = min(Distance)),
by = .(Lect.x, Lect.y, Sequence.x, Category)] %>%
.[, .(Distance = mean(Distance)),
by = .(Lect.x, Lect.y, Category)] %>%
# pmin/pmax canonicalize the pair label so "A vs. B" == "B vs. A".
mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
'vs.',
pmax(Lect.x, Lect.y),
sep = ' ')) %>%
.[, .(Distance = max(Distance)),
by = .(Lect_vs_Lect, Category)] %>%
# Wide: one row per lect pair, one column per category (O, N, C).
dcast(., Lect_vs_Lect ~ Category, value.var = 'Distance')
ONC_distance
Count the number of tones in each lect:
# Count tones per lect: '-' marks "no tones" and is turned into NA by the
# NA replacement in gsub; otherwise tones are space-separated, so the count
# is (number of spaces) + 1. The NA -> 0 step now lives in the pipeline via
# replace_na, replacing the fragile base-R subset-assignment
# (Tones[cond, ]$T <- 0) on a data.table.
Tones <- Eurasia %>%
select(Lect, `T`) %>%
mutate(`T` = gsub("\\-", NA, `T`)) %>%
mutate(`T` = str_count(`T`, " ") + 1) %>%
mutate(`T` = replace_na(`T`, 0))
Tones
Calculate the Canberra distance between the numbers of tones of each pair of lects:
# Canberra distance between tone counts. dist() now receives only the
# numeric tone column: the original passed the whole table, coercing the
# character Lect column to NA (the "NAs introduced by coercion" warning)
# and silently rescaling every distance by a constant factor. The factor is
# harmless after the later scale(), but the warning masked real problems.
Tones_distance <- dist(Tones$`T`, method = 'canberra') %>%
as.matrix() %>%
as.data.table() %>%
setnames(Tones$Lect) %>%
mutate(Lect = Tones$Lect) %>%
melt(id = 'Lect',
variable.name = 'Lect2',
value.name = 'T') %>%
# Pairs of toneless lects give all-0/0 Canberra terms -> NA; treat as 0.
mutate(`T` = replace_na(`T`, 0)) %>%
# Canonical unordered pair label, consistent with ONC_distance.
mutate(Lect_vs_Lect = str_c(pmin(as.character(Lect), as.character(Lect2)),
'vs.',
pmax(as.character(Lect), as.character(Lect2)),
sep = ' ')) %>%
select(Lect_vs_Lect, `T`) %>%
distinct()
Tones_distance
Join segmental distance with tonal distance:
# Combine the segmental (O, N, C) and tonal (T) distances per lect pair
# (natural join on Lect_vs_Lect).
ONCT_distance <- full_join(ONC_distance, Tones_distance)
ONCT_distance
Calculate the overall distance (PhonoDist):
# Overall phonological distance: z-score each component so O, N, C and T
# are on a common scale, then combine them Euclidean-style. Subtracting each
# column's minimum shifts the scaled values to start at 0 before squaring.
PhonoDist <- ONCT_distance %>%
select(-Lect_vs_Lect) %>%
scale() %>%
as.data.table() %>%
mutate(Lect_vs_Lect = ONCT_distance$Lect_vs_Lect) %>%
mutate(Distance = sqrt((O - min(O)) ^ 2 +
(N - min(N)) ^ 2 +
(C - min(C)) ^ 2 +
(`T` - min(`T`)) ^ 2)) %>%
select(Lect_vs_Lect, Distance)
PhonoDist
Split the Lect_vs_Lect column of PhonoDist into Lect.x and Lect.y:
# Split the canonical "A vs. B" pair labels back into two lect columns.
Lect_vs_Lect <- PhonoDist$Lect_vs_Lect %>%
str_split_fixed(' vs. ', n = 2) %>%
as.data.table() %>%
setnames(c('Lect.x', 'Lect.y'))
Lect_vs_Lect
Visualize the phonological distances via multidimensional scaling:
# Classical multidimensional scaling of the pairwise distances: rebuild a
# square matrix from the pair list (each unordered pair appears once, so
# dcast gives a triangular matrix; t() + as.dist() reads the filled
# triangle), then embed in up to n-1 dimensions. The "only 123 ... > 0"
# warning is expected: cmdscale keeps only dimensions with positive
# eigenvalues.
PhonoScale <- PhonoDist %>%
bind_cols(Lect_vs_Lect) %>%
select(-Lect_vs_Lect) %>%
dcast(Lect.x ~ Lect.y, value.var = 'Distance') %>%
column_to_rownames('Lect.x') %>%
t() %>%
as.dist() %>%
cmdscale(k = nrow(Eurasia) - 1) %>%
as.data.frame() %>%
rownames_to_column('Lect') %>%
as.data.table()
## Warning in cmdscale(., k = nrow(Eurasia) - 1): only 123 of the first 208
## eigenvalues are > 0
PhonoScale[1:10, 1:10]
Cluster PhonoScale into two groups:
# K-means clustering of the MDS coordinates into 2 groups; keep the
# per-lect cluster assignments as a factor. (No seed is set, so labels vary
# between runs.)
K2 <- PhonoScale %>%
select(-Lect) %>%
kmeans(centers = 2) %>%
pluck('cluster') %>%
as_factor()
K2
## [1] 2 2 1 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 2 1 2 1 2 1 1 2 1 2 2 2 1 1 1 1 2
## [38] 1 2 1 2 1 1 1 1 2 1 1 1 1 2 1 2 1 1 2 2 1 1 2 1 2 2 1 1 1 2 1 1 2 1 2 1 1
## [75] 1 2 1 1 1 2 2 1 1 2 1 2 2 1 2 1 1 2 1 1 2 1 1 1 1 2 2 1 2 2 1 1 1 1 1 1 1
## [112] 1 2 1 2 1 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 2 1 2 1 2 2 1 1 1 1 2 1 2
## [149] 1 1 1 1 2 1 2 2 1 1 1 2 1 2 1 2 2 1 1 2 2 1 2 2 1 1 1 1 1 1 2 2 2 1 2 1 1
## [186] 1 1 1 1 1 2 1 1 1 1 2 2 2 2 2 1 1 2 2 2 2 2 1 2
## Levels: 1 2
Cluster PhonoScale into five groups:
# K-means clustering of the MDS coordinates into 5 groups; keep the
# per-lect cluster assignments as a factor.
K5 <- PhonoScale %>%
select(-Lect) %>%
kmeans(centers = 5) %>%
pluck('cluster') %>%
as_factor()
K5
## [1] 4 1 5 5 4 2 3 5 3 1 2 2 3 2 2 5 1 1 1 5 4 5 4 2 4 2 3 4 2 4 5 4 1 5 2 3 5
## [38] 2 4 3 4 2 1 2 3 4 5 2 5 3 4 1 4 3 1 4 4 1 2 1 3 5 4 1 3 2 1 1 3 4 2 5 2 2
## [75] 2 4 3 1 1 4 5 3 2 1 2 4 5 1 5 2 3 4 2 2 4 1 2 2 1 5 1 3 5 5 3 3 3 3 1 1 2
## [112] 3 4 5 5 2 4 4 1 1 1 2 1 1 5 2 4 4 2 3 3 2 3 5 5 4 2 5 3 5 4 2 3 3 1 4 3 1
## [149] 2 2 3 2 4 2 5 4 1 1 2 4 2 5 3 1 5 2 2 1 2 1 5 4 2 3 2 2 5 2 4 5 5 2 4 1 2
## [186] 2 2 2 2 2 5 3 3 2 3 5 5 5 4 4 5 3 4 4 5 5 4 5 4
## Levels: 1 2 3 4 5
Cluster PhonoScale into ten groups:
# K-means clustering into 10 groups; shift the labels to 0-9 (single
# characters, convenient for plotting as map glyphs) before converting to
# a factor.
K10 <- PhonoScale %>%
select(-Lect) %>%
kmeans(centers = 10) %>%
pluck('cluster') %>%
{ . - 1 } %>%
as_factor()
K10
## [1] 4 9 3 6 7 1 0 3 5 9 2 1 3 2 1 3 5 5 5 8 4 8 7 2 4 2 5 6 1 7 8 7 9 3 2 5 8
## [38] 1 7 0 7 2 5 1 0 4 8 2 3 0 7 5 6 3 5 4 4 5 2 9 0 8 4 5 0 1 9 5 0 7 1 8 2 2
## [75] 2 4 0 9 5 7 6 3 2 2 2 4 6 5 8 5 5 4 2 2 6 5 2 5 5 8 2 3 8 8 5 3 3 0 5 2 2
## [112] 5 7 3 8 2 4 7 9 9 5 2 5 5 8 2 7 6 2 0 3 2 3 8 8 7 2 8 0 6 7 2 1 3 5 4 3 5
## [149] 2 2 0 2 7 2 7 6 5 5 2 4 2 6 3 9 8 2 2 5 8 5 8 4 2 3 2 2 2 2 7 8 8 2 7 5 2
## [186] 2 2 1 2 2 8 3 0 2 3 6 8 6 4 4 6 3 7 7 8 8 4 3 7
## Levels: 0 1 2 3 4 5 6 7 8 9
Subset lects and their coordinates:
# Coordinates (longitude/latitude) for the lects in the Eurasian sample.
Lect_LonLat <- select(filter(PhonoBib, Lect %in% Eurasia$Lect),
Lect, lon, lat)
Lect_LonLat
Add clusters to PhonoScale:
# One row per lect with its K2/K5/K10 cluster labels and map coordinates
# (natural join on Lect). Assumes PhonoScale's row order matches the
# clustering vectors, which were computed from it.
PhonoClusters <- PhonoScale %>%
mutate(K2 = K2,
K5 = K5,
K10 = K10) %>%
select(Lect, K2, K5, K10) %>%
left_join(Lect_LonLat)
PhonoClusters
Load map data:
# World polygon data from ggplot2::map_data. The preview previously called
# head(world), which printed the unrelated geodata::world() function (as the
# rendered output below this chunk shows); preview the object actually
# created here instead.
map <- map_data("world")
head(map)
##
## 1 function (resolution = 5, level = 0, path, version = "latest",
## 2 ...)
## 3 {
## 4 stopifnot(level[1] == 0)
## 5 resolution = round(resolution[1])
## 6 stopifnot(resolution %in% 1:5)
Create a map of Eurasia:
# Base map of Eurasia: white country polygons on an orthographic projection
# centred near (70E, 20N). NOTE(review): geom_polygon's `size` aesthetic is
# deprecated in favour of `linewidth` in ggplot2 >= 3.4 — fine for the
# ggplot2 version this document was knitted with, but worth confirming on
# upgrade.
EurasiaMap <- ggplot(map, aes(x = long, y = lat)) +
geom_polygon(aes(group = group),
fill = "white",
color = "darkgrey",
size = 0.2) +
coord_map("ortho",
orientation = c(20, 70, 0),
xlim = c(10, 130),
ylim = c(0, 90)) +
theme_void()
EurasiaMap
Assign the two clusters on the map:
# Plot each lect's K2 cluster label at its coordinates, write the figure to
# PDF through the Cairo device (needed for the IPA/Unicode glyphs and the
# Times New Roman family), then print it again for the rendered document.
PhonoK2 <- EurasiaMap +
geom_text(aes(x = lon,
y = lat,
label = K2,
color = K2),
data = PhonoClusters,
show.legend = FALSE) +
theme(legend.position = 'bottom')
cairo_pdf(file = 'PhonoK2.pdf',
width = 6,
height = 4.5,
family = 'Times New Roman')
PhonoK2
dev.off()
## quartz_off_screen
## 2
PhonoK2
Assign the five clusters on the map:
# Same map as PhonoK2 but labelled/coloured by the five-way clustering;
# saved to PDF and then printed inline.
PhonoK5 <- EurasiaMap +
geom_text(aes(x = lon,
y = lat,
label = K5,
color = K5),
data = PhonoClusters,
show.legend = FALSE) +
theme(legend.position = 'bottom')
cairo_pdf(file = 'PhonoK5.pdf',
width = 6,
height = 4.5,
family = 'Times New Roman')
PhonoK5
dev.off()
## quartz_off_screen
## 2
PhonoK5
Assign the ten clusters on the map:
# Same map labelled/coloured by the ten-way clustering (labels 0-9);
# saved to PDF and then printed inline.
PhonoK10 <- EurasiaMap +
geom_text(aes(x = lon,
y = lat,
label = K10,
color = K10),
data = PhonoClusters,
show.legend = FALSE) +
theme(legend.position = 'bottom')
cairo_pdf(file = 'PhonoK10.pdf',
width = 6,
height = 4.5,
family = 'Times New Roman')
PhonoK10
dev.off()
## quartz_off_screen
## 2
PhonoK10
Subset the first three dimensions of the multidimensional scaling and join the K-means clustering:
# First three MDS dimensions per lect, with the cluster labels and
# coordinates attached (natural join on Lect).
ThreeD <- left_join(select(PhonoScale, Lect, V1, V2, V3), PhonoClusters)
head(ThreeD)
Draw a 3D plot of multidimensional scaling:
# Interactive 3D scatter of the first three MDS dimensions, each lect
# drawn as its name, coloured by its five-way cluster.
ThreeDPlot <- plot_ly(
data = ThreeD,
x = ~V1,
y = ~V2,
z = ~V3,
color = ~K5,
text = ~Lect,
textfont = list(size = 15),
type = 'scatter3d',
mode = 'text') %>%
layout(showlegend = FALSE)
ThreeDPlot
Create a tibble of lects, coordinates, families, and a dummy column (Kilometers):
# Lects with coordinates and family, plus a constant 'Kilometers' column
# that serves as a join key for the full cross join in the next chunk
# (every row matches every row).
Lect_Kilometers <- Lect_LonLat %>%
left_join(PhonoBib) %>%
select(Lect, lon, lat, Family) %>%
mutate(Kilometers = 'Kilometers')
Lect_Kilometers
Create Lect vs. Lect pairs:
# Cross join of all lect pairs (via the constant Kilometers key), with a
# canonical "A vs. B" label (pmin/pmax order the names) and a flag for
# pairs drawn from different language families.
Coordinates <- Lect_Kilometers %>%
full_join(Lect_Kilometers, by = 'Kilometers') %>%
mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
"vs.",
pmax(Lect.x, Lect.y),
sep = " "),
Crossfamilial = Family.x != Family.y)
Coordinates
Subset Coordinates x:
# First lect's coordinates for each pair (column order lon, lat as
# expected by distHaversine below).
Coordinates.x <- Coordinates %>% select(lon.x, lat.x)
Coordinates.x
Subset Coordinates y:
# Second lect's coordinates for each pair.
Coordinates.y <- Coordinates %>% select(lon.y, lat.y)
Coordinates.y
Calculate the geographical distances between two columns of coordinates:
# Great-circle distance per lect pair: distHaversine returns metres, so
# divide by 1000 for kilometres. Only cross-family pairs are kept, and the
# canonical pair label deduplicates the two orderings of each pair.
GeoDist <- Coordinates %>%
mutate(Kilometers =
distHaversine(Coordinates.x, Coordinates.y) / 1000) %>%
filter(Crossfamilial) %>%
select(Lect_vs_Lect, Kilometers) %>%
distinct()
GeoDist
Join the phonological distances and geographical distances:
# Attach each pair's phonological distance to its geographical distance.
PhonoGeoDist <- GeoDist %>%
left_join(PhonoDist, by = 'Lect_vs_Lect')
PhonoGeoDist
Linear regression between geographical distance and phonological distances:
# Simple linear regression of phonological distance on geographical
# distance; the pipe supplies PhonoGeoDist as lm's data argument.
PhonoGeoDist %>%
lm(formula = Distance ~ Kilometers) %>%
summary()
##
## Call:
## lm(formula = Distance ~ Kilometers, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9728 -0.6582 -0.0037 0.6696 4.0808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.395e+00 1.433e-02 306.63 <2e-16 ***
## Kilometers 1.281e-04 3.128e-06 40.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.01 on 18174 degrees of freedom
## Multiple R-squared: 0.08448, Adjusted R-squared: 0.08443
## F-statistic: 1677 on 1 and 18174 DF, p-value: < 2.2e-16
PhonoGeoDist
Visualize the linear regression:
# Scatter of geographical vs. phonological distance with the fitted
# regression line; written to PDF through the Cairo device, then printed
# inline for the rendered document.
PhonoLM <-
ggplot(aes(x = Kilometers, y = Distance), data = PhonoGeoDist) +
geom_point(alpha = 0.1) +
geom_smooth(formula = y ~ x, method = 'lm', color = 'red') +
theme_classic() +
scale_x_continuous(name = 'Geographical distance (km)') +
scale_y_continuous(name = 'Phonological distance (z)')
cairo_pdf(file = 'PhonoLM.pdf',
width = 4,
height = 3,
family = 'Times New Roman')
PhonoLM
dev.off()
## quartz_off_screen
## 2
PhonoLM